Objectives

  • Deal with overplotting
  • Order bar chart
  • Zooming
  • Change labels, themes, and scales

1. Reading in data

In this section we’ll continue using the CRC dataset, and we’ll also load the NCI60 dataset.

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

# Load the two example tables from the local data directory.
NCI60 <- read.csv(file = "./data/NCI60.csv")
CRC <- read.csv(file = "./data/CRC_train.csv")

# List the column names of the NCI60 table.
colnames(NCI60)
##   [1] "BR_BT549_a"     "BR_BT549_b"     "BR_HS578T_a"    "BR_HS578T_b"   
##   [5] "BR_MCF7_a"      "BR_MCF7_b"      "BR_MDAMB231_a"  "BR_MDAMB231_b" 
##   [9] "BR_MDAMB468_a"  "BR_MDAMB468_b"  "BR_T47D_a"      "BR_T47D_b"     
##  [13] "CNS_SF268_a"    "CNS_SF268_b"    "CNS_SF295_a"    "CNS_SF295_b"   
##  [17] "CNS_SF539_a"    "CNS_SF539_b"    "CNS_SNB19_a"    "CNS_SNB19_b"   
##  [21] "CNS_SNB75_a"    "CNS_SNB75_b"    "CNS_U251_a"     "CNS_U251_b"    
##  [25] "CO_COLO205_a"   "CO_COLO205_b"   "CO_HCC2998_a"   "CO_HCC2998_b"  
##  [29] "CO_HCT116_a"    "CO_HCT116_b"    "CO_HCT15_a"     "CO_HCT15_b"    
##  [33] "CO_HT29_a"      "CO_HT29_b"      "CO_KM12_a"      "CO_KM12_b"     
##  [37] "CO_SW620_a"     "CO_SW620_b"     "LC_A549_a"      "LC_A549_b"     
##  [41] "LC_EKVX_a"      "LC_EKVX_b"      "LC_HOP62_a"     "LC_HOP62_b"    
##  [45] "LC_HOP92_a"     "LC_HOP92_b"     "LC_NCIH226_a"   "LC_NCIH226_b"  
##  [49] "LC_NCIH23_a"    "LC_NCIH23_b"    "LC_NCIH322M_a"  "LC_NCIH322M_b" 
##  [53] "LC_NCIH460_a"   "LC_NCIH460_b"   "LC_NCIH522_a"   "LC_NCIH522_b"  
##  [57] "LE_CCRFCEM_a"   "LE_CCRFCEM_b"   "LE_HL60_a"      "LE_HL60_b"     
##  [61] "LE_K562_a"      "LE_K562_b"      "LE_MOLT4_a"     "LE_MOLT4_b"    
##  [65] "LE_RPMI8226_a"  "LE_RPMI8226_b"  "LE_SR_a"        "LE_SR_b"       
##  [69] "ME_LOXIMVI_a"   "ME_LOXIMVI_b"   "ME_M14_a"       "ME_M14_b"      
##  [73] "ME_MALME3M_a"   "ME_MALME3M_b"   "ME_MDAMB435_a"  "ME_MDAMB435_b" 
##  [77] "ME_SKMEL2_a"    "ME_SKMEL2_b"    "ME_SKMEL28_a"   "ME_SKMEL28_b"  
##  [81] "ME_SKMEL5_a"    "ME_SKMEL5_b"    "ME_UACC257_a"   "ME_UACC257_b"  
##  [85] "ME_UACC62_a"    "ME_UACC62_b"    "OV_IGROV1_a"    "OV_IGROV1_b"   
##  [89] "OV_NCIADRRES_a" "OV_NCIADRRES_b" "OV_OVCAR3_a"    "OV_OVCAR3_b"   
##  [93] "OV_OVCAR4_a"    "OV_OVCAR4_b"    "OV_OVCAR5_a"    "OV_OVCAR5_b"   
##  [97] "OV_OVCAR8_a"    "OV_OVCAR8_b"    "OV_SKOV3_a"     "OV_SKOV3_b"    
## [101] "PR_DU145_a"     "PR_DU145_b"     "PR_PC3_a"       "PR_PC3_b"      
## [105] "RE_7860_a"      "RE_7860_b"      "RE_A498_a"      "RE_A498_b"     
## [109] "RE_ACHN_a"      "RE_ACHN_b"      "RE_CAKI1_a"     "RE_CAKI1_b"    
## [113] "RE_RXF393_a"    "RE_RXF393_b"    "RE_SN12C_a"     "RE_SN12C_b"    
## [117] "RE_TK10_a"      "RE_TK10_b"      "RE_UO31_a"      "RE_UO31_b"     
## [121] "Protein"

2. Improve figure clarity

## 2.1 Deal with overplotting

# Basic scatter plot of two CRC columns
p <- ggplot(data = CRC, mapping = aes(x = SERPINA3, y = TIMP1))
p + geom_point()

# Jitter: move each point by a small, random amount
p + geom_jitter(width = 0.25)

# For larger datasets
s <- ggplot(data = NCI60, mapping = aes(x = BR_BT549_a, y = BR_HS578T_a))
s + geom_point()

# Hollow circles (shape 1) instead of solid points
s + geom_point(shape = 1)

# Pixel-sized points
s + geom_point(shape = ".")

# Alpha blending (transparency) makes the points see-through.
# If alpha is given as a ratio, the denominator is the number of points that
# must be overplotted to give a solid colour.
s + geom_point(alpha = 1 / 3)

s + geom_point(alpha = 1 / 5)

s + geom_point(alpha = 1 / 10)

## 2.2 Order bar chart
# Default bar chart: one bar per Sub_group, in the factor's own level order
g <- ggplot(data = CRC, mapping = aes(x = Sub_group))
g + geom_bar()

# Count the number of samples in each sub group
subgroup <- CRC %>%
  group_by(Sub_group) %>%
  summarise(n = n())
subgroup
## # A tibble: 3 × 2
##   Sub_group     n
##      <fctr> <int>
## 1    Benign    34
## 2       CRC   100
## 3   Healthy    66
# Sort the rows by count (ascending), then rebuild Sub_group as a factor whose
# levels follow the sorted row order so that the plot keeps this ordering.
subgroup <- subgroup %>%
  arrange(n) %>%
  mutate(Sub_group = factor(Sub_group, levels = Sub_group))
subgroup
## # A tibble: 3 × 2
##   Sub_group     n
##      <fctr> <int>
## 1    Benign    34
## 2   Healthy    66
## 3       CRC   100
# Bar heights come straight from the pre-computed counts in column n
ggplot(data = subgroup, mapping = aes(x = Sub_group, y = n)) +
  geom_col()

## 2.3 Zooming
# Base plot object shared by the SERPINA3 histograms below
h <- ggplot(data = CRC, mapping = aes(x = SERPINA3))

# Histogram with the default binning
h + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Change the bar width by setting binwidth explicitly
h + geom_histogram(binwidth = 0.1)

# Zoom: restrict the visible x-axis range to 12.5-16 (y-axis left unchanged)
h +
  geom_histogram(binwidth = 0.1) +
  coord_cartesian(xlim = c(12.5, 16))

# Colour a specific point in the scatter plot.
# Start from the plain scatter plot for comparison.
ggplot(data = CRC) +
  geom_point(mapping = aes(x = SERPINA3, y = TIMP1))

# Work on a copy so that CRC itself is left untouched.
CRC2 <- CRC
highlight.sample <- "P1D2"

# Flag each row as "highlight" or "normal".
# %in% is used instead of == because it never returns NA and it keeps working
# if highlight.sample is later extended to several sample names.
CRC2$highlight <- ifelse(
  CRC2$Sample %in% highlight.sample,
  "highlight",
  "normal"
)

# Rows to annotate with a text label in the plot.
textdf <- CRC2[CRC2$highlight == "highlight", ]

# Named vector mapping each value of the highlight column to a colour.
mycolours <- c("highlight" = "red", "normal" = "grey50")

textdf
##        A1AG2      AFM    AHSG AIAG.Bovine     ANT3     AOC3     APOB
## 115 14.43676 15.68612 19.4506    15.06871 17.12417 9.772163 15.41505
##         ATRN      BTD  C20orf3    CADM1    CD163     CD44     CDH5
## 115 13.33803 15.87152 10.65438 8.843735 11.22391 9.635675 8.581254
##          CFH      CFI      CLU       CP     CTSD DKFZp686N02209     DSG2
## 115 17.39842 17.52245 19.63051 15.54674 9.689814       20.61501 10.41874
##         ECM1      F11       F5    FCGBP FETUA.Bovine    FETUB      FGA
## 115 12.30604 15.25164 12.21818 12.18561     17.07109 13.92578 8.627747
##          FGG     FHR3      FN1    GOLM1      HP      HRG    HYOU1   ICAM1
## 115 10.34984 10.57234 12.41536 8.566989 20.7664 16.89356 8.362621 8.47747
##      IGFBP3   IGHA2    IGHG2    ITIH4    KLKB1     KNG1    LAMP2    LCN2
## 115 9.71304 18.1794 21.37353 15.34769 13.75194 17.13578 14.36741 9.43319
##     LGALS3BP     LRG1      LUM    LYVE1    MMRN1      MPO     MRC2
## 115 14.05995 14.73259 15.30533 12.07749 11.40201 11.78676 10.82856
##         MST1    NCAM1     ORM1     PGCP     PIGR     PLTP   PLXDC2    PON1
## 115 12.66785 10.44025 16.78709 8.633163 8.614645 11.44895 9.736548 16.0177
##         PRG4     PROC    PTPRJ   Q5JNX2 SERPINA1 SERPINA3 SERPINA6
## 115 12.33507 9.898844 9.300791 20.02495 16.39812 14.48573 15.47579
##     SERPINA7   THBS1    TIMP1      TNC      VTN      VWF Sample   Group
## 115 13.08085 15.1787 11.91644 10.15539 12.28754 10.89411   P1D2 Healthy
##     Age Gender Cancer_stage Tumour_location Sub_group highlight
## 115  68 female           NA            <NA>   Healthy highlight
# Scatter plot coloured by the highlight flag, with a text label next to each
# highlighted point. The label is read from textdf's own Sample column so it
# always belongs to the row being drawn; x is inherited from the plot-level
# mapping and y is shifted slightly so the text does not sit on the point.
ggplot(data = CRC2, mapping = aes(x = SERPINA3, y = TIMP1)) +
  geom_point(mapping = aes(colour = highlight), size = 3) +
  scale_color_manual("Sample", values = mycolours) +
  geom_text(
    data = textdf,
    mapping = aes(y = TIMP1 * 0.99, label = Sample),
    colour = "red"
  )

## 2.4 Change labels, themes, and scales
# 2.4.1 Titles, subtitles and captions
p1 <- ggplot(data = CRC) +
  geom_point(mapping = aes(x = SERPINA3, y = TIMP1, colour = Sub_group))
p1 +
  ggtitle(
    label = "Compare between sub groups",
    subtitle = "Benign samples are mixed with the other two groups"
  ) +
  labs(caption = "Data vis example")

# Axis labels and legend title
p1 +
  xlab("Protein SERPINA3") +
  ylab("Protein TIMP1") +
  labs(colour = "Sub groups")

# 2.4.2 Themes: change the appearance of non-data elements
# Built-in complete themes, applied one at a time to the same plot
p1 + theme_gray()

p1 + theme_classic()

p1 + theme_dark()

p1 + theme_light()

p1 + theme_void()

# Override a single theme element: white panel with a grey50 outline
p1 +
  theme(
    panel.background = element_rect(fill = "white", colour = "grey50")
  )

# Keep the two proteins of interest plus the sample identifier.
CRC.two.prot <- CRC[, c("SERPINA3", "TIMP1", "Sample")]

# Reshape the first 20 samples to long format: one row per (Sample, Protein).
# head() returns at most 20 rows, whereas indexing with [1:20, ] would pad the
# result with all-NA rows if the table had fewer than 20 rows.
# NOTE(review): gather() is superseded by pivot_longer() in tidyr >= 1.0.0;
# kept here because the installed tidyr version is not visible.
plot.data <- head(CRC.two.prot, 20) %>%
  gather(Protein, Abundance, -Sample)

# One line per protein across the samples.
p2 <- ggplot(plot.data) +
  geom_line(aes(x = Sample, y = Abundance, group = Protein, colour = Protein))

# Change the appearance and the orientation angle of axis labels
# Reference: http://ggplot2.tidyverse.org/reference/theme.html
p2 +
  theme(
    axis.text.x = element_text(
      face = "bold", color = "blue", size = 10, angle = 45
    ),
    axis.title.x = element_text(face = "bold", colour = "#990000", size = 20),
    axis.text.y = element_text(face = "bold", color = "blue", size = 14),
    axis.title.y = element_text(face = "bold", colour = "#990000", size = 20)
  )

# Hide the x and y axis tick mark labels and the axis titles
p2 +
  theme(
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.text.y = element_blank(),
    axis.title.y = element_blank()
  )

# Remove the axis tick marks (tick labels are left in place)
p2 + theme(axis.ticks = element_blank())

# Adjust the legend position
p2 + theme(legend.position = "right") # the default

p2 + theme(legend.position = "bottom")

p2 + theme(legend.position = "none")

# Strips
# Facetting splits the data into subsets and draws the same graph for each
# subset, laid out as a table of panels; here one column of panels per Group.
p3 <- ggplot(data = CRC) +
  geom_point(mapping = aes(x = SERPINA3, y = TIMP1)) +
  facet_grid(. ~ Group)
p3

# Restyle the facet label strips: yellow fill with a white border
p3 +
  theme(
    strip.background = element_rect(colour = "white", fill = "yellow")
  )

# 2.4.3 Scales: control the appearance of data elements.
# There is a scale function for each aesthetic.

# Change the colour scale
p1 + scale_colour_grey()

p1 + scale_colour_brewer(palette = "Set1")

# Change the axis scales: log10 on x, then log10 on y
p1 + scale_x_log10()

p1 + scale_y_log10()

# Name both axes and set limits of 12 to 16 on the x scale
p1 +
  scale_x_continuous(name = "Protein SERPINA3", limits = c(12, 16)) +
  scale_y_continuous(name = "Protein TIMP1")
## Warning: Removed 15 rows containing missing values (geom_point).

Challenge: Draw a scatter plot of the abundance of proteins AFM and AHSG across all samples, and highlight samples P1A1, P1B1 and P1B12.

\(~\)

\(~\)

\(~\)

\(~\)

\(~\)

\(~\)

\(~\)

\(~\)

\(~\)

\(~\)

# Challenge solution: AFM vs AHSG scatter plot with three samples highlighted.
CRC2 <- CRC

# Samples to highlight, as listed in the challenge statement.
highlight.sample <- c("P1A1", "P1B1", "P1B12")

# Flag the selected samples and keep their rows for the text labels.
CRC2$highlight <- ifelse(
  CRC2$Sample %in% highlight.sample,
  "highlight",
  "normal"
)
textdf <- CRC2[CRC2$Sample %in% highlight.sample, ]
mycolours <- c("highlight" = "red", "normal" = "grey50")

# The label is read from textdf's own Sample column. textdf keeps the row
# order of CRC2, which need not match the order of highlight.sample, so
# labelling with highlight.sample could attach names to the wrong points
# (or fail if one of the samples is absent from the data).
ggplot(data = CRC2, mapping = aes(x = AFM, y = AHSG)) +
  geom_point(mapping = aes(colour = highlight), size = 3) +
  scale_color_manual("Sample", values = mycolours) +
  geom_text(
    data = textdf,
    mapping = aes(x = AFM * 0.99, label = Sample),
    colour = "red"
  )

Challenge: Zoom in on the figure you just plotted so that the x-axis (AFM) covers the range 15.5–16.2.

\(~\)

\(~\)

\(~\)

\(~\)

\(~\)

\(~\)

\(~\)

\(~\)

\(~\)

\(~\)

# Challenge solution: the highlighted AFM vs AHSG plot, with the visible
# x-axis range restricted to 15.5-16.2 via coord_cartesian().
# Labels are read from textdf's own Sample column so that each name stays
# attached to its own point regardless of the row order in textdf.
ggplot(data = CRC2, mapping = aes(x = AFM, y = AHSG)) +
  geom_point(mapping = aes(colour = highlight), size = 3) +
  scale_color_manual("Sample", values = mycolours) +
  geom_text(
    data = textdf,
    mapping = aes(x = AFM * 0.995, label = Sample),
    colour = "red"
  ) +
  coord_cartesian(xlim = c(15.5, 16.2))